library(tidyverse)
library(bigrquery)
library(wesanderson)
library(plotly)
library(hrbrthemes)
library(patchwork)
library(flextable)
library(ggstatsplot)
library(GGally)
library(viridis)
library(lubridate)
library(geosphere)
# BigQuery connection parameters for the Olist e-commerce dataset
project_id <- "tc-da-1"
dataset_id <- "olist_db"
# Use DBI::dbConnect explicitly: bigrquery only Imports (does not attach) DBI,
# so a bare dbConnect() is not on the search path unless library(DBI) ran.
con <- DBI::dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = dataset_id
)
# to close the connection: dbDisconnect(con)
-- Per-item freight ratio: share of the total charge that goes to shipping
select
price, freight_value,
round(freight_value / (freight_value + price),2) as freight_ratio,
from
`olist_db.olist_order_items_dataset`
-- (R, next chunk) work on a copy of the query result
df <- df_freight_ratio
# # Simple histogram - refined below by calculating the values manually
# hist(df$freight_ratio, main = "Number of order in different ranges of shipping cost",
# xlab = "freight ratio = freight cost / (price + freight cost)",
# ylab = "number of orders", col = "steelblue", border = "white")
# abline(v = 0.19, col = "red")
# text(x = 0.19, y = par("usr")[4] - 2500, labels = "Median = 0.19", pos = 4, col = "red")
# Calculate revenue and number of orders - as well as their proportion and
# cumulative sum - for different segments of freight ratio
# Width of each freight-ratio bin (5 percentage points)
fr_bin_value <- 0.05
ddf <- df %>%
# bin freight_ratio into [0, 1] intervals of width fr_bin_value
mutate(fr_bin = cut(freight_ratio, breaks = seq(0, 1, by = fr_bin_value)) ) %>%
group_by(fr_bin) %>%
# relabel each bin with the largest freight_ratio observed inside it,
# turning the factor bins into numeric x positions for plotting
mutate(fr_bin = max(freight_ratio) ) %>%
reframe(
revenue = sum(price),
n_orders = n(),
# shares are taken against the whole data frame, not the bin
prop_orders = n() / nrow(df),
prop_total_revenue = sum(price) / sum(df$price)
) %>%
na.omit() %>%
arrange(fr_bin) %>%
# running totals across bins (relies on the arrange() above)
mutate(
cum_revenue = cumsum(revenue),
cum_prop_revenue = cumsum(prop_total_revenue)
)
# Dual-axis scaling factor: cumulative revenue share (0-1) is drawn on the
# same panel as order proportions; the secondary axis undoes the scaling.
scale_dual_axis <- 0.2
ddf %>%
ggplot(aes(x = fr_bin)) +
geom_bar(aes(y = prop_orders), stat = "identity", fill = "steelblue", color = "white", width = fr_bin_value) +
geom_line(aes(y = cum_prop_revenue*scale_dual_axis), color = "red") +
scale_y_continuous(
sec.axis = sec_axis(~./scale_dual_axis, name = "% Cumulative Revenue", breaks = seq(0, 1, 0.1))
) +
theme_minimal() +
scale_x_continuous(breaks = seq(0, 1, fr_bin_value)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
coord_cartesian(xlim = c(0, 0.8)) +
labs(
title = "Orders for different segments of Freight Ratio",
x = "Freight ratio segments",
y = "Proportion of total Orders"
) + theme(
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
# Histogram of freight ratio, split by order-value quartile
df %>%
mutate(price_qtile = ntile(price, 4)) %>%
group_by(price_qtile) %>%
mutate(price_range = paste0(min(price),"-",max(price)) %>% factor) %>%
ggplot(aes(x = freight_ratio, fill = as.factor(price_range)) ) +
geom_histogram(binwidth = 0.02, alpha = 1) +
# stat_bin(aes(y=..count../sum(..count..)),geom="step", binwidth = 0.02) +
# geom_density(alpha = 0.7) +
# facet_wrap(~ price_range, ncol = 2) +
labs(title = "Histogram of Freight Ratio by Price Range",
x = "Freight Ratio",
y = "Number of Orders",
fill = "Segments of orders \nby value (quartiles)") +
theme_minimal() +
scale_fill_manual(values = wesanderson::wes_palette("Zissou1")) +
theme(legend.position = c(0.7,0.7))
# Freight ratio in 4 categories of products of increasing price
# Box plots of freight ratio across order-value quartiles, with each
# quartile's median printed beside its box.
df %>%
mutate(price_qtile = ntile(price, 4)) %>%
group_by(price_qtile) %>%
mutate(price_range = factor(paste0(min(price), "-", max(price)))) %>%
ggplot(aes(x = price_range, y = freight_ratio)) +
geom_boxplot(color = "steelblue") +
stat_summary(
geom = "text",
fun.data = function(y) data.frame(y = median(y), label = median(y)),
hjust = 4, vjust = 0.5, color = "black", size = 3
) +
labs(
title = "Freight ratio for orders of different value",
x = "Price range (quartiles)",
y = "Freight ratio"
) +
theme_minimal()
# Freight ratio decreases very quickly with order value
# Scatter of freight ratio vs order value on a log-scaled x axis:
# shipping eats a very large share of cheap orders.
df %>%
ggplot(aes(x = price, y = freight_ratio)) +
geom_point(color = "steelblue", alpha = 0.1) +
scale_x_continuous(
trans = "log",
breaks = 2^(0:12),
labels = function(x) round(x, 2)
) +
labs(
title = "Very high shipping fees for orders < 100 BR$",
x = "Order value (log)",
y = "Freight ratio"
) +
theme_minimal()
# # Alternative representation: 2D histogram (to highlight counts)
# df %>%
# # filter(freight_ratio >= 0.2) %>%
# ggplot(aes(x = freight_ratio, y = price)) +
# geom_hex(bins = 30) +
# scale_fill_continuous(type = "viridis", trans = "log", breaks = 10^seq(4)) +
# scale_y_continuous(trans = "log", breaks = 10^seq(3) ) +
# scale_x_continuous(breaks = seq(0,1,0.1)) +
# theme_minimal() +
# coord_fixed(ratio = 0.1) +
# theme(
# axis.text.x = element_text(size = 12),
# axis.text.y = element_text(size = 12),
# panel.grid.minor = element_blank()
# )
# # Original (not log transformed) values
# df %>%
# filter(freight_ratio > 0, price < 1500) %>%
# sample_frac(0.1) %>%
# ggplot(aes(x = price, y = freight_ratio)) +
# geom_point(alpha = 0.1, color = "steelblue") +
# theme_minimal()
# Revenue simulation: if every order's freight ratio dropped by
# pct_decrease_fr, how much extra revenue would each segment see?
# (The assignment below was accidentally fused into the comment above
# during export, which left pct_decrease_fr undefined.)
pct_decrease_fr <- 0.05
ddf <- df %>%
rename(fr = freight_ratio) %>%
filter(fr > 0) %>%
mutate(lower_fr = round(fr * (1 - pct_decrease_fr),2) ) %>%
# orders observed at the current fr vs at the discounted fr; their ratio
# proxies the expected order uplift from the cheaper shipping
add_count(fr, name = "cnt_fr") %>%
add_count(lower_fr, name = "cnt_lower_fr") %>%
mutate(order_increase = cnt_lower_fr / cnt_fr) %>%
group_by(fr) %>%
reframe(
AOV = sum(price) / n(),
current_revenue = round(cnt_fr * AOV,2) %>% unique,
new_revenue = round(cnt_lower_fr * AOV,2) %>% unique
) %>%
# express in deciles instead of percentiles of freight_ratio
mutate(ntile_fr = ntile(fr,10)) %>%
group_by(ntile_fr) %>%
mutate(fr_range = paste(min(fr)*100,"-", max(fr)*100,"%")) %>%
group_by(fr_range) %>%
reframe(
current_revenue = sum(current_revenue),
new_revenue = sum(new_revenue)
)
# Overall revenue uplift implied by the simulation, then a per-segment
# current-vs-new revenue comparison.
pct_increase_revenue <- round((sum(ddf$new_revenue) / sum(ddf$current_revenue) - 1) * 100,2)
ddf %>%
pivot_longer(cols = c("current_revenue","new_revenue"), names_to = "revenue") %>%
ggplot(aes(x = fr_range, y = value, fill = revenue)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
axis.title.y = element_text(angle = 90)) +
scale_y_continuous(labels = scales::comma_format()) +
labs(
title = paste0("Expected increase in revenue: ",pct_increase_revenue,"%"),
subtitle = paste0("when freight ratio is lowered by ",pct_decrease_fr*100,"%"),
x = "Freight ratio segment", y = "revenue per segment"
) +
theme(
legend.position = c(0.7,0.7),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
# Olist generally takes a cut of 10% on the price of the item (shipping fees excluded).
# Same revenue simulation as above, now also tracking the freight revenue
# given up by discounting shipping by pct_decrease_fr.
pct_decrease_fr <- 0.05
ddf <- df %>%
rename(fr = freight_ratio) %>%
filter(fr > 0) %>%
# absolute discount granted on each order's freight charge
mutate(freight_discount = freight_value * pct_decrease_fr) %>%
mutate(lower_fr = round(fr * (1 - pct_decrease_fr),2) ) %>%
# orders observed at the current fr vs at the discounted fr; their ratio
# proxies the expected order uplift
add_count(fr, name = "cnt_fr") %>%
add_count(lower_fr, name = "cnt_lower_fr") %>%
mutate(order_increase = cnt_lower_fr / cnt_fr) %>%
group_by(fr) %>%
reframe(
AOV = sum(price) / n(),
AO_freight_discount = sum(freight_discount) / n(),
current_revenue = round(cnt_fr * AOV,2) %>% unique,
new_revenue = round(cnt_lower_fr * AOV,2) %>% unique,
freight_loss = round(cnt_lower_fr * AO_freight_discount,2) %>% unique
) %>%
# express in deciles instead of percentiles of freight_ratio
mutate(ntile_fr = ntile(fr,10)) %>%
group_by(ntile_fr) %>%
mutate(fr_range = paste(min(fr)*100,"-", max(fr)*100,"%")) %>%
group_by(fr_range) %>%
reframe(
current_revenue = sum(current_revenue),
new_revenue = sum(new_revenue),
total_freight_loss = sum(freight_loss)
)
# Applying the freight discount on all freight ratio segments
# Profit = 10% commission on item price; freight loss is subtracted.
ddf %>%
mutate(
current_profit = current_revenue * 0.10,
new_profit = (new_revenue * 0.10) - total_freight_loss
) %>%
summarise(
pct_increase_profit = (sum(new_profit) - sum(current_profit)) / sum(current_profit) * 100,
profit_increase = sum(current_profit) * pct_increase_profit / 100
) %>% flextable() %>%
add_header_row(values = c("Apply discount to all segments",""))
## Apply discount to all segments | |
## |---|---|
## pct_increase_profit | profit_increase |
## 1.519294 | 20,564.7 |
# Applying the freight discount only to the 10-18% freight ratio
# NOTE(review): unlike the all-segments case above, total_freight_loss is not
# subtracted from new_profit here -- confirm whether that is intentional.
ddf %>%
mutate(
current_profit = current_revenue * 0.10,
new_profit = ifelse(
fr_range == "10 - 18 %", new_revenue * 0.10, current_revenue * 0.10
)
) %>%
summarise(
pct_increase_profit = (sum(new_profit) - sum(current_profit)) / sum(current_profit) * 100,
profit_increase = sum(current_profit) * pct_increase_profit / 100
) %>% flextable() %>%
add_header_row(values = c("Apply discount to 10-18% fr segment",""))
## Apply discount to 10-18% fr segment | |
## |---|---|
## pct_increase_profit | profit_increase |
## 9.068499 | 122,748.4 |
There is actually no difference in the reviews for sellers with different freight ratio
-- Join each order item's freight ratio with its review score
select
t1.order_id,
round(t1.freight_value / (t1.freight_value + t1.price),2) as freight_ratio,
review_score as n_stars
from `olist_db.olist_order_items_dataset` t1 join `olist_db.olist_order_reviews_dataset` t2
on t1.order_id = t2.order_id
# df_FR_stars %>%
# mutate(qtile_FR = ntile(freight_ratio, 4)) %>%
# group_by(qtile_FR, n_stars) %>%
# reframe(
# cnt = n()
# ) %>%
# ungroup %>% group_by(qtile_FR) %>%
# reframe(
# n_stars = unique(n_stars),
# prop = cnt / sum(cnt)
# )
# Share of each star rating within each freight-ratio quartile:
# add_count() attaches the quartile's total order count, so n() / n_orders is
# the within-quartile share of each (quartile, stars) cell; unique() collapses
# the repeated per-row value to one number per group.
ddf_stars <- df_FR_stars %>%
mutate(qtile_FR = ntile(freight_ratio, 4)) %>%
add_count(qtile_FR, name = "n_orders") %>%
group_by(qtile_FR, n_stars) %>%
reframe(
prop = unique(n() / n_orders)
)
# # Quartiles of FR
# ddf_stars <- df_FR_stars %>%
# mutate(qtile_FR = ntile(freight_ratio, 4))
#
# ggbarstats(
# y = qtile_FR,
# x = n_stars,
# data = ddf_stars,
# results.subtitle = F,
# label = "percent",
# caption = F
# )
# High and low FR
# Review-score composition for high vs low freight ratio orders.
df_FR_stars %>%
mutate(FR_segment = ifelse(freight_ratio <= 0.2, "low","high")) %>%
ggbarstats(
y = FR_segment,
x = n_stars,
results.subtitle = F,
label = "percent"
) +
# scale_fill_viridis_d()
# "Dark2" (not "Dark1") is the valid RColorBrewer qualitative palette;
# others: Set1-3, Paired, Accent, Pastel1-2
scale_fill_brewer(palette = "Dark2", direction = -1) +
theme(axis.text.x = element_text(size = 12)) +
labs(
title = "Number of Stars in Reviews for high and low freight ratio",
x = "Freight ratio"
)
/* Distance between seller and customer */
-- NB the lat/lng for each zipcode are averaged
with
geo as
(
select
distinct(geolocation_zip_code_prefix) zipcode,
avg(geolocation_lat) as lat,
avg(geolocation_lng) as lng,
from `olist_db.olist_geolocation_dataset`
group by geolocation_zip_code_prefix
order by
geolocation_zip_code_prefix
),
seller_info as
(
select
distinct(sellers.seller_id),
seller_zip_code_prefix as seller_zipcode,
geo.lat as seller_lat,
geo.lng as seller_lng,
seller_city,
seller_state,
order_id,
price,
freight_value,
product_weight_g,
product_length_cm * product_height_cm * product_width_cm as product_volume_cm3
from `olist_db.olist_sellers_dataset` sellers join `olist_db.olist_order_items_dataset` items
on sellers.seller_id = items.seller_id
join geo on sellers.seller_zip_code_prefix = geo.zipcode
join `olist_db.olist_products_dataset` products on items.product_id = products.product_id
),
customer_info as
(
select
distinct(cust.customer_id),
customer_zip_code_prefix as customer_zipcode,
customer_city,
customer_state,
order_id,
date(order_purchase_timestamp) as date_purchased,
date(order_delivered_customer_date) as date_delivered,
geo.lat as customer_lat,
geo.lng as customer_lng
from
-- NOTE(review): 'olist_customesr_dataset' looks like a typo for
-- 'olist_customers_dataset' -- confirm the actual table name in the dataset
`olist_db.olist_customesr_dataset` cust join `olist_db.olist_orders_dataset` orders
on cust.customer_id = orders.customer_id
join geo on cust.customer_zip_code_prefix = geo.zipcode
)
select
*
from seller_info join customer_info
on seller_info.order_id = customer_info.order_id
;
library(lubridate)
# Build the working data frame from the SQL result: freight ratio, its
# high/low segment (threshold 0.2), and delivery time in days.
df <- df_sql %>%
select(-order_id_1) %>%
mutate(
fr = freight_value / (price + freight_value),
fr_segment = ifelse(fr <= 0.2, "low", "high"),
days_to_deliver = as.numeric(interval(date_purchased, date_delivered)) / (60 * 60 * 24)
) %>%
filter(!is.na(days_to_deliver))
# df %>% glimpse
library(geosphere)
# Great-circle distance (metres) between one seller and one customer.
# s_lng/s_lat = seller coordinates; c_lng/c_lat = customer coordinates.
calculate_distance <- function(s_lng, s_lat, c_lng, c_lat) {
  seller_point <- matrix(c(s_lng, s_lat), ncol = 2)
  customer_point <- matrix(c(c_lng, c_lat), ncol = 2)
  distm(seller_point, customer_point, fun = distGeo)
}
# Seller-to-customer distance for every order (metres); pmap_dbl applies
# calculate_distance row-wise over the four coordinate columns.
D_vector <- list(
df$seller_lng, df$seller_lat,
df$customer_lng, df$customer_lat
) %>%
pmap_dbl(calculate_distance, .progress = TRUE)## ■■■■■ 13% | ETA: 16s
## ■■■■■■■■■■ 29% | ETA: 13s
## ■■■■■■■■■■■■■■■ 45% | ETA: 10s
## ■■■■■■■■■■■■■■■■■■■ 61% | ETA: 7s
## ■■■■■■■■■■■■■■■■■■■■■■■■ 77% | ETA: 4s
## ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 93% | ETA: 1s
## seller_zipcode seller_lat seller_lng price
## Min. : 1001 Min. :-32.079 Min. :-63.89 Min. : 0.85
## 1st Qu.: 6506 1st Qu.:-23.613 1st Qu.:-48.83 1st Qu.: 40.00
## Median :13660 Median :-23.419 Median :-46.76 Median : 78.00
## Mean :24616 Mean :-22.795 Mean :-47.24 Mean : 123.92
## 3rd Qu.:29154 3rd Qu.:-21.757 3rd Qu.:-46.52 3rd Qu.: 139.00
## Max. :99730 Max. : -2.501 Max. :-34.86 Max. :6735.00
##
## freight_value product_weight_g product_volume_cm3 customer_zipcode
## Min. : 0.00 Min. : 0 Min. : 168 Min. : 1003
## 1st Qu.: 13.12 1st Qu.: 300 1st Qu.: 2816 1st Qu.:11250
## Median : 16.32 Median : 700 Median : 6358 Median :24350
## Mean : 20.07 Mean : 2087 Mean : 15130 Mean :35057
## 3rd Qu.: 21.19 3rd Qu.: 1800 3rd Qu.: 18200 3rd Qu.:58415
## Max. :409.68 Max. :40425 Max. :296208 Max. :99980
## NA's :16 NA's :16
## customer_lat customer_lng fr days_to_deliver
## Min. :-33.69 Min. :-72.669 Min. :0.0000 Min. : 0.00
## 1st Qu.:-23.59 1st Qu.:-48.110 1st Qu.:0.1162 1st Qu.: 7.00
## Median :-22.92 Median :-46.633 Median :0.1833 Median : 10.00
## Mean :-21.21 Mean :-46.192 Mean :0.2093 Mean : 12.43
## 3rd Qu.:-20.14 3rd Qu.:-43.635 3rd Qu.:0.2771 3rd Qu.: 16.00
## Max. : 42.18 Max. : -8.724 Max. :0.9633 Max. :210.00
##
## distance_km
## Min. : 0.0
## 1st Qu.: 187.2
## Median : 433.0
## Mean : 598.4
## 3rd Qu.: 796.0
## Max. :8652.0
##
# Quick distribution check for a column: side-by-side boxplot + histogram.
# column   - column name (string) to look up in `data`
# take_log - 1 to log-transform the values first; any other value leaves them as-is
# data     - data frame to pull the column from (defaults to the global df,
#            preserving the original behavior)
describe <- function(column, take_log = 0, data = df) {
  # restore the plotting layout when the function exits
  old_par <- par(mfrow = c(1, 2))
  on.exit(par(old_par), add = TRUE)
  var <- data[[column]]
  if (take_log == 1) {
    var <- log(var)
  }
  boxplot(var, main = column)
  hist(var)
}
describe("days_to_deliver", 0)
## Rows: 99,478
## Columns: 23
## $ seller_id <chr> "0cbcee27c791afa0cdcb08587a2013a8", "9bcdfa7b615b3a…
## $ seller_zipcode <int> 37410, 89053, 87502, 24020, 29156, 85863, 37564, 93…
## $ seller_lat <dbl> -21.69337, -26.87730, -23.75942, -22.89370, -20.278…
## $ seller_lng <dbl> -45.25977, -49.08145, -53.29278, -43.12042, -40.411…
## $ seller_city <chr> "tres coracoes", "blumenau", "umuarama", "niteroi",…
## $ seller_state <chr> "MG", "SC", "PR", "RJ", "ES", "PR", "MG", "RS", "SC…
## $ order_id <chr> "2c45c33d2f9cb8ff8b1c86cc28c11c30", "c158e9806f85a3…
## $ price <dbl> 135.00, 79.90, 279.90, 689.90, 84.90, 279.00, 119.9…
## $ freight_value <dbl> 18.51, 21.01, 16.13, 98.81, 36.14, 16.12, 17.58, 30…
## $ product_weight_g <int> 2250, 300, 500, 22350, 8325, 3800, 1350, 300, 50, 1…
## $ product_volume_cm3 <int> 6600, 3500, 3600, 228480, 19866, 55352, 14350, 2160…
## $ customer_id <chr> "de4caa97afa80c8eeac2ff4c8da5b72e", "25456ee3b0cf84…
## $ customer_zipcode <int> 88058, 74475, 3380, 37142, 18078, 13224, 28740, 638…
## $ customer_city <chr> "florianopolis", "goiania", "sao paulo", "divisa no…
## $ customer_state <chr> "SC", "GO", "SP", "MG", "SP", "SP", "RJ", "CE", "SP…
## $ date_purchased <date> 2016-10-09, 2017-04-14, 2017-04-26, 2017-04-22, 20…
## $ date_delivered <date> 2016-11-09, 2017-05-08, 2017-05-08, 2017-05-12, 20…
## $ customer_lat <dbl> -27.442895, -16.611184, -23.576068, -21.509981, -23…
## $ customer_lng <dbl> -48.40148, -49.32568, -46.53257, -46.19579, -47.467…
## $ fr <dbl> 0.120578464, 0.208205331, 0.054487721, 0.125280521,…
## $ fr_segment <chr> "low", "high", "low", "low", "high", "low", "low", …
## $ days_to_deliver <dbl> 31, 24, 12, 20, 9, 15, 13, 44, 14, 18, 15, 41, 17, …
## $ distance_km <dbl> 712, 1137, 690, 352, 809, 825, 444, 2980, 436, 808,…
With relation to freight ratio
##
## Call:
## lm(formula = days_to_deliver ~ poly(distance_km, 2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.413 -4.154 -1.546 2.564 32.942
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.28095 0.02063 546.75 <2e-16 ***
## poly(distance_km, 2)1 899.71365 6.27180 143.45 <2e-16 ***
## poly(distance_km, 2)2 -307.22799 6.27180 -48.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.272 on 92395 degrees of freedom
## Multiple R-squared: 0.1992, Adjusted R-squared: 0.1991
## F-statistic: 1.149e+04 on 2 and 92395 DF, p-value: < 2.2e-16
# # Regular scatterplot. The correlation is invisible
# df %>%
# sample_frac(0.5) %>%
# select(distance_km, days_to_deliver) %>%
# ggplot(aes(x = distance_km, y = days_to_deliver)) +
# geom_point(color = "steelblue", alpha = 0.7) +
# theme_minimal()
# 2D histogram. The correlation becomes evident
# See
# https://stackoverflow.com/questions/54092169/how-to-plot-ggplots-hexagon-only-if-number-greater-than-threshold
# Hex-bin heatmap of delivery time vs distance, faceted by freight-ratio
# segment, with counts bucketed into powers-of-two bands for a discrete fill.
df %>%
sample_frac(1) %>%
# filter(fr_segment == "high") %>%
select(distance_km, days_to_deliver, fr_segment) %>%
mutate(distance_bin = cut(distance_km, breaks = seq(0, 2500, by = 10))) %>%
group_by(distance_bin, fr_segment) %>%
mutate(distance_bin = max(distance_km)) %>%
# after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0
ggplot(aes(x = distance_bin, y = days_to_deliver,
fill = cut(after_stat(count), 2^seq(4,15)) )) +
geom_hex(bins = 30) +
scale_fill_viridis_d(labels = list(2^seq(4,11),2^seq(5,12)) %>% pmap(~ paste0(.x, "-", .y))) +
# scale_fill_continuous(type = "viridis", trans = "log", breaks = 2^seq(15)) +
theme_minimal() +
coord_fixed(ratio = 2000 / 40) +
labs(
title = "Number of orders delivered in N days for each distance",
subtitle = "by freight ratio segment",
x = "Distance (km)",
y = "Days to deliver",
fill = "Number of\nOrders"
) +
facet_wrap(vars(fr_segment), ncol = 2) +
theme(strip.text = element_text(size = 12))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# # Continous version - difficult to threshold
# df %>%
# sample_frac(1) %>%
# select(distance_km, days_to_deliver) %>%
# mutate(distance_bin = cut(distance_km, breaks = seq(0, 2500, by = 10))) %>%
# group_by(distance_bin) %>%
# mutate(distance_bin = max(distance_km)) %>%
# ggplot(aes(x = distance_bin, y = days_to_deliver )) +
# geom_hex(bins = 30) +
# scale_fill_continuous(type = "viridis", trans = "log", breaks = 2^seq(15)) +
# theme_minimal() +
# coord_fixed(ratio = 50) +
# labs(
# title = "Number of orders delivered in N days for each distance",
# x = "Distance (km)",
# y = "Days to deliver",
# fill = "Number of\norders"
# )
# Delivery-time quintiles compared between high and low freight-ratio
# segments. (The `df %>%` below had been fused into the comment above during
# export, leaving the whole pipeline without a data source.)
df %>%
sample_frac(1) %>%
select(fr, days_to_deliver, fr_segment) %>%
mutate(delivery_times_pctile = ntile(days_to_deliver, 5)) %>%
ggbarstats(
y = fr_segment,
x = delivery_times_pctile,
label = "percent",
results.subtitle = F
) +
# "Dark2" (not "Dark1") is the valid RColorBrewer palette name
scale_fill_brewer(palette = "Dark2", direction = 1, labels = c("Very Slow","Slow","Average","Fast","Very Fast")) +
theme(axis.text.x = element_text(size = 12)) +
labs(
title = "Delivery times for high and low freight ratio",
x = "Freight ratio segment"
) +
guides(fill = guide_legend(title = "Delivery time"))
# Smaller orders have much higher freight ratio, but that is not surprising, and we already knew it.
# Price distribution (log scale) for the high vs low freight-ratio segments.
df %>%
sample_frac(1) %>%
select(price, fr, fr_segment) %>%
filter(fr > 0) %>%
ggbetweenstats(
x = fr_segment,
y = price,
results.subtitle = F
) +
scale_y_continuous(trans = "log") +
theme(axis.text.x = element_text(size = 12))
# 5% sample for the slower exploratory plots below
d_sample <- df %>% sample_frac(0.05)
# Explore using only on a sample of df_distance to save time
d_sample %>%
filter(fr > 0) %>%
select(fr, distance_km, product_weight_g, product_volume_cm3, days_to_deliver) %>%
na.omit() %>%
ggpairs()
# Standardized copy of the modelling variables
# (mutate_at is superseded in dplyr; across() is the current idiom)
d_sample_std <- d_sample %>%
mutate(across(all_of(c("fr", "distance_km", "product_weight_g", "product_volume_cm3", "days_to_deliver")), scale))
# Data for the freight-ratio models: candidate predictors only, drop
# zero-fr rows (log undefined there), and precompute log(fr).
df2fit <- df %>%
select(fr, distance_km, product_weight_g, product_volume_cm3, days_to_deliver, price) %>%
filter(fr > 0) %>%
mutate(log_fr = log(fr))
# Modelling freight_value also gives a good result
# NB: this fit uses the full `df` (freight_value is not in df2fit's select)
fit <- lm(freight_value ~ distance_km +
product_weight_g +
product_volume_cm3 +
# days_to_deliver +
# fr +
log(price),
data = df)
fit %>% summary##
## Call:
## lm(formula = freight_value ~ distance_km + product_weight_g +
## product_volume_cm3 + log(price), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.22 -3.07 -0.08 2.48 323.33
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.957e-01 1.616e-01 -1.211 0.226
## distance_km 1.168e-02 7.662e-05 152.403 <2e-16 ***
## product_weight_g 1.479e-03 1.463e-05 101.044 <2e-16 ***
## product_volume_cm3 1.671e-04 2.296e-06 72.801 <2e-16 ***
## log(price) 1.822e+00 3.823e-02 47.650 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.61 on 92378 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.5637, Adjusted R-squared: 0.5637
## F-statistic: 2.984e+04 on 4 and 92378 DF, p-value: < 2.2e-16
## distance_km product_weight_g product_volume_cm3 log(price)
## 1.013149 2.959883 2.841904 1.218997
# Modelling freight_ratio
# Price is by far the most important factor (negatively correlated!): 19% of the variance
# Simple linear baseline: fr as a function of raw (untransformed) price
fit <- lm(fr ~ price, data = df2fit)
summary(fit)##
## Call:
## lm(formula = fr ~ price, data = df2fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23568 -0.08162 -0.02921 0.05242 1.75412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.412e-01 4.385e-04 550.1 <2e-16 ***
## price -2.921e-04 1.987e-06 -147.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1112 on 92082 degrees of freedom
## Multiple R-squared: 0.19, Adjusted R-squared: 0.19
## F-statistic: 2.16e+04 on 1 and 92082 DF, p-value: < 2.2e-16
# We get close to 80% of explained variance!.
# Multiple regression: distance + weight + log(price); volume and delivery
# time are left out (commented) in this variant
fit <- lm(fr ~
distance_km +
product_weight_g +
# product_volume_cm3 +
# days_to_deliver +
log(price),
data = df2fit)
fit %>% summary##
## Call:
## lm(formula = fr ~ distance_km + product_weight_g + log(price),
## data = df2fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57900 -0.03513 -0.00837 0.02561 0.48585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.927e-01 9.955e-04 695.8 <2e-16 ***
## distance_km 8.584e-05 4.724e-07 181.7 <2e-16 ***
## product_weight_g 8.575e-06 5.745e-08 149.2 <2e-16 ***
## log(price) -1.263e-01 2.350e-04 -537.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05917 on 92065 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.7707, Adjusted R-squared: 0.7707
## F-statistic: 1.031e+05 on 3 and 92065 DF, p-value: < 2.2e-16
## distance_km product_weight_g log(price)
## 1.013073 1.201499 1.214290
# df2fit <- df %>%
# select(fr, distance_km, product_weight_g, product_volume_cm3, days_to_deliver, price) %>%
# filter(fr > 0) %>%
# mutate(log_fr = log(fr))
# We get > 80% of explained variance!.
# Same model with log-transformed distance (+1 guards against log(0))
fit <- lm(fr ~
log(distance_km + 1) +
product_weight_g +
product_volume_cm3 +
# log(days_to_deliver + 1) +
log(price),
data = df2fit)
fit %>% summary##
## Call:
## lm(formula = fr ~ log(distance_km + 1) + product_weight_g + product_volume_cm3 +
## log(price), data = df2fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48202 -0.03531 -0.01087 0.02513 0.46884
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.832e-01 1.217e-03 479.28 <2e-16 ***
## log(distance_km + 1) 2.721e-02 1.482e-04 183.54 <2e-16 ***
## product_weight_g 6.662e-06 8.973e-08 74.24 <2e-16 ***
## product_volume_cm3 3.682e-07 1.408e-08 26.15 <2e-16 ***
## log(price) -1.271e-01 2.347e-04 -541.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05889 on 92064 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.7729, Adjusted R-squared: 0.7729
## F-statistic: 7.832e+04 on 4 and 92064 DF, p-value: < 2.2e-16
## log(distance_km + 1) product_weight_g product_volume_cm3
## 1.015760 2.958916 2.843141
## log(price)
## 1.222443
# Let's make a df with the log of the values we are interested and explore
# the correlations
df2fit <- df %>%
filter(fr > 0, distance_km > 0, price > 0, price < 2000) %>%
mutate(log_fr = log(fr), log_distance_km = log(distance_km), log_price = log(price), fr_pct = fr*100) %>%
select(fr, fr_pct, freight_value, price, distance_km, log_fr, log_price, log_distance_km, product_weight_g, product_volume_cm3) %>%
filter(log_price > 0, log_distance_km > 0)
# NB: each `fit <-` below overwrites the previous one; the R^2 figures quoted
# in the comments were read off interactively, and only the final (simplest)
# model is summarised at the end.
fit <- lm(
fr ~
log_price +
distance_km +
product_weight_g +
product_volume_cm3,
data = df2fit
)
# log_price explains ~ 64% of the variance in fr
fit <- lm(fr ~ log_price, data = df2fit)
# distance_km explains about 3.5% of the variance
fit <- lm(fr ~ distance_km, data = df2fit)
# together they explain ~ 72% of the variance
fit <- lm(fr ~ log_price + I(distance_km^(0.5)), data = df2fit)
# adding weight and volume explains an additional 5%, to 77% of the variance
# however this leads to a certain extent of violation of lm assumptions
# the best model is obtained with log(1/price) and the sqrt(distance_km) 78% of the variance
fit <- lm(fr ~ log(I(1/log_price)) + I(distance_km^(0.5)), data = df2fit)
# however the most important evidence is that fr is predicted almost entirely from price
# wrt other expected variables, such as distance, weight and volume, so we will use the
# simplest model:
fit <- lm(fr ~ log_price, data = df2fit)
fit %>% summary##
## Call:
## lm(formula = fr ~ log_price, data = df2fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42723 -0.04452 -0.01306 0.03381 0.51545
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6779265 0.0011887 570.3 <2e-16 ***
## log_price -0.1090671 0.0002689 -405.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07388 on 91870 degrees of freedom
## Multiple R-squared: 0.6417, Adjusted R-squared: 0.6417
## F-statistic: 1.645e+05 on 1 and 91870 DF, p-value: < 2.2e-16
Now freight_value
# Rebuild df2fit including freight_value (and its log) for the
# freight-value models below.
df2fit <- df %>%
filter(fr > 0, freight_value > 0, distance_km > 0, price > 0, price < 2000) %>%
mutate(log_fr = log(fr), log_freight_value = log(freight_value),
log_distance_km = log(distance_km), log_price = log(price)) %>%
select(fr, freight_value, log_freight_value, price, distance_km,
log_fr, log_price, log_distance_km, product_weight_g, product_volume_cm3) %>%
filter(log_price > 0, log_distance_km > 0)
# Modelling freight_value also gives a good result
fit <- lm(freight_value ~ distance_km +
product_weight_g +
product_volume_cm3 +
log(price),
data = df2fit)
fit %>% summary##
## Call:
## lm(formula = freight_value ~ distance_km + product_weight_g +
## product_volume_cm3 + log(price), data = df2fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.36 -3.02 -0.14 2.38 324.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.263e-01 1.575e-01 2.706 0.00681 **
## distance_km 1.174e-02 7.430e-05 157.961 < 2e-16 ***
## product_weight_g 1.491e-03 1.421e-05 104.931 < 2e-16 ***
## product_volume_cm3 1.597e-04 2.233e-06 71.485 < 2e-16 ***
## log(price) 1.696e+00 3.729e-02 45.487 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.292 on 91852 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.5712, Adjusted R-squared: 0.5711
## F-statistic: 3.058e+04 on 4 and 91852 DF, p-value: < 2.2e-16
## distance_km product_weight_g product_volume_cm3 log(price)
## 1.013187 2.941070 2.828425 1.215112
# note also the positive association between price and dimensions
# (heavier/bulkier products fetch higher prices; summary is printed inline)
lm(price ~ product_volume_cm3 + product_weight_g, data = df2fit) %>% summary##
## Call:
## lm(formula = price ~ product_volume_cm3 + product_weight_g, data = df2fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.03 -61.75 -36.65 13.72 1910.01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.335e+01 5.791e-01 143.92 <2e-16 ***
## product_volume_cm3 6.445e-04 3.518e-05 18.32 <2e-16 ***
## product_weight_g 1.210e-02 2.195e-04 55.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 146.7 on 91854 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.1328, Adjusted R-squared: 0.1328
## F-statistic: 7036 on 2 and 91854 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = freight_value ~ distance_km + product_weight_g +
## product_volume_cm3 + log(price), data = df2fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.36 -3.02 -0.14 2.38 324.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.263e-01 1.575e-01 2.706 0.00681 **
## distance_km 1.174e-02 7.430e-05 157.961 < 2e-16 ***
## product_weight_g 1.491e-03 1.421e-05 104.931 < 2e-16 ***
## product_volume_cm3 1.597e-04 2.233e-06 71.485 < 2e-16 ***
## log(price) 1.696e+00 3.729e-02 45.487 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.292 on 91852 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.5712, Adjusted R-squared: 0.5711
## F-statistic: 3.058e+04 on 4 and 91852 DF, p-value: < 2.2e-16
# car::vif(fit)
# plot(fit)
# freight ratio ggpairs
# Pairwise scatter/correlation matrix on a 1% sample (ggpairs is slow on the full data)
df2fit %>%
sample_frac(0.01) %>%
select(log_fr, log_price, distance_km, product_volume_cm3, product_weight_g) %>%
ggpairs() +
theme_minimal() +
ggtitle("Freight Ratio associations")# freight value ggpairs
# Same pairs plot, but for log freight value instead of log freight ratio
df2fit %>%
sample_frac(0.01) %>%
select(log_freight_value, log_price, distance_km, product_volume_cm3, product_weight_g) %>%
ggpairs() +
theme_minimal() +
ggtitle("Freight Value associations")##
## Call:
## lm(formula = fr ~ poly(distance_km, 2), data = df2fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25906 -0.08798 -0.02384 0.06355 0.69411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2060344 0.0003991 516.27 <2e-16 ***
## poly(distance_km, 2)1 7.2081861 0.1209629 59.59 <2e-16 ***
## poly(distance_km, 2)2 -1.7793287 0.1209629 -14.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.121 on 91869 degrees of freedom
## Multiple R-squared: 0.03939, Adjusted R-squared: 0.03937
## F-statistic: 1884 on 2 and 91869 DF, p-value: < 2.2e-16
Suggestion: try doing the heatmaps using reactable